--- title: "Bernoulli or logistic regression?" author: "Hedibert Freitas Lopes" date: "2/8/2020" output: html_document: toc: true number_sections: true --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo=FALSE) ``` # Data structure The following data is taken from example 11 from Bolstad (2010) Understanding Computational Bayesian Statistics, Wiley Series in Computational Statistics. The data is on the age and coronary heart disease status for $n=100$ subjects taken from Hosmer and Lemeshow (1989) # Applied Logistic Regression, John Wiley and Sons. ```{r fig.width=9, fig.width=6} #install.packages("aplore3") library("aplore3") data(chdage) n = nrow(chdage) y = rep(0,100) y[chdage$chd=="Yes"]=1 x = chdage$age plot(x,y,xlab="Age",ylab="coronary heart disease status") ``` # Bernoulli model In this model, the 0/1 or Yes/no data ${\cal D}=\{y_1,\ldots,y_n\}$ are modeled as conditionally independent and identically distributed (iid) Bernoulli(\theta), i.e. $$ p(y_i|\theta) = \theta^{y_i}(1-\theta)^{1-y_i}, \qquad y_i \in \{0,1\}, \theta \in (0,1). $$ Therefore, the number of observed 1's, $s_n=\sum_{i=1}^n y_i$, follows, conditionally on $\theta$, a Binomial distribution, i.e. $$ p(s_n|\theta) = \frac{n!}{s_n!(n-s_n!)}\theta^{s_n}(1-\theta)^{n-s_n}, $$ for $s_n \in \{0,1,\ldots,n\}$. The likelihood function is, then, $$ L(\theta|s_n) \propto \theta^{s_n}(1-\theta)^{n-s_n}, $$ and it is easy to see that the maximum likelihood estimator of $\theta$ is $$ {\hat \theta}_{mle}=\frac{s_n}{n}. $$ ```{r fig.width=9, fig.width=6} sn = sum(y) theta.mle = sn/n c(sn,theta.mle) ``` ## Bayesian inference Bayesian inference is complete when a prior distribution for $\theta$ is set up. Let us use the conjugate prior for $\theta$, a $Beta(a_0,b_0)$, where $$ p(\theta) = \frac{\Gamma(a_0+b_0)}{\Gamma(a_0)\Gamma(b_0)}\theta^{a_0-1}(1-\theta)^{b_0-1}, \qquad \theta \in (0,1), a_0,b_0>0, $$ with prior mean and variance equal to $$ E(\theta) = \frac{a_0}{a_0+b_0} \ \ \mbox{and} \ \ V(\theta) = \frac{a_0b_0}{(a_0+b_0)^2(a_0+b_0+1)}. 
```{r fig.width=9, fig.height=6}
sn = sum(y)
a0 = 4
b0 = 16
a1 = a0+sn
b1 = b0+n-sn
# Prior moments
mean = a0/(a0+b0)
stdev = sqrt(a0*b0/((a0+b0)^2*(a0+b0+1)))
prior.mom = c(mean,stdev,qbeta(c(0.05,0.5,0.95),a0,b0))
# Posterior moments
mean = a1/(a1+b1)
stdev = sqrt(a1*b1/((a1+b1)^2*(a1+b1+1)))
qbeta.m1 = qbeta(c(0.05,0.5,0.95),a1,b1)
post.mom = c(mean,stdev,qbeta.m1)
table = round(rbind(prior.mom,post.mom),3)
rownames(table) = c("Prior summaries","Posterior summaries")
colnames(table) = c("Mean","St.Dev.","5%","50%","95%")
table
```

```{r fig.width=9, fig.height=6}
thetas = seq(0,1,length=1000)
plot(thetas,dbeta(thetas,a1,b1),xlab=expression(theta),ylab="Density",type="l",lwd=2)
lines(thetas,dbeta(thetas,a0,b0),col=2,lwd=2)
points(sn/n,0,pch=16,col=3,cex=2)
legend("topright",legend=c("Data","Prior","Posterior"),col=3:1,lty=1,lwd=2)
```

## Prior predictive of $s_n=\sum_{i=1}^n y_i$ given $(n,a_0,b_0)$

```{r fig.width=9, fig.height=6}
sn1 = 0:n
term1 = gamma(a0+b0)/(gamma(a0)*gamma(b0))
term2 = gamma(a0+sn1)*gamma(b0+n-sn1)/gamma(a0+b0+n)
term3 = gamma(n+1)/(gamma(sn1+1)*gamma(n-sn1+1))
prob.sn1 = term1*term2*term3
plot(sn1,prob.sn1,xlab=expression(s[n]),ylab="Prior predictive")
points(sn,0,pch=16,col=2)
legend("topright",legend=paste("Upper tail prob = ",round(sum(prob.sn1[sn1>=sn]),4),sep=""))
```

## Posterior predictive of $y_{n+1}$ given $(a_1,b_1)$

It can be shown that
$$
Pr(y_{n+1}=1|s_n,a_0,b_0) = \frac{a_0+s_n}{a_0+b_0+n} = \alpha+\beta {\bar y}_n,
$$
where $\alpha=a_0/(a_0+b_0+n)$, $\beta=n/(a_0+b_0+n)$ and ${\bar y}_n = s_n/n$.

```{r fig.width=9, fig.height=6}
# Temporary copies (aa,bb) are used for the alternative hyperparameter values,
# so that a0 and b0 keep their original values (4,16) for the later chunks
aa = a0; bb = b0
prob.next = (aa+sn1)/(aa+bb+n)
leg = paste("a0=",aa,",","b0=",bb,sep="")
plot(sn1/n,prob.next,type="l",ylim=c(0,1),
     xlab="Sample proportion",ylab="Posterior predictive of one",lwd=2)
aa=0;bb=0;prob.next=(aa+sn1)/(aa+bb+n)
leg=c(leg,paste("a0=",aa,",","b0=",bb,sep=""))
lines(sn1/n,prob.next,col=2,lwd=2)
aa=30;bb=10;prob.next=(aa+sn1)/(aa+bb+n)
leg=c(leg,paste("a0=",aa,",","b0=",bb,sep=""))
lines(sn1/n,prob.next,col=3,lwd=2)
legend("topleft",legend=leg,col=1:3,lwd=2)
```

# Logit and probit regression models

In their simplest forms, these models relate the observation $y_i$ (coronary heart disease status) to the covariate $x_i$ (age) via either a logit or a probit link function:
\begin{eqnarray*}
\theta_i &=& \frac{1}{1+e^{-(\beta_0+\beta_1 x_i)}}\\
\theta_i &=& \Phi(\beta_0+\beta_1 x_i),
\end{eqnarray*}
respectively.

```{r fig.width=9, fig.height=6}
# A small amount of jitter is added to age so that overlapping points are visible
plot(x+rnorm(n,0,0.2),y,xlab="Age",ylab="Coronary heart disease status")
```

## Likelihood function of $(\beta_0,\beta_1)$

Given the data ${\cal D}_n=\{(x_1,y_1),\ldots,(x_n,y_n)\}$, the likelihood function of $(\beta_0,\beta_1)$ is given by
$$
L(\beta_0,\beta_1|{\cal D}_n) = \prod_{i=1}^n \theta_i^{y_i} (1-\theta_i)^{1-y_i}.
$$
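Before maximizing this likelihood numerically in the next chunk, a quick cross-check is available from R's built-in `glm`, which fits both models by iteratively reweighted least squares. This sketch is not part of the original analysis; it reuses the `y` and `x` objects defined above.

```{r}
# Cross-check (not in the original analysis): glm fits the same logit and
# probit regressions and should agree with the nlm estimates below
fit.logit  = glm(y ~ x, family=binomial(link="logit"))
fit.probit = glm(y ~ x, family=binomial(link="probit"))
round(rbind(logit=coef(fit.logit), probit=coef(fit.probit)), 4)
```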
```{r}
# Log-likelihoods of (beta0,beta1) under the logit and probit links
loglikelihood.logit = function(beta){
  eta = beta[1]+beta[2]*x
  theta = exp(eta)/(1+exp(eta))
  return(sum(y*log(theta)+(1-y)*log(1-theta)))
}
loglikelihood.probit = function(beta){
  theta = pnorm(beta[1]+beta[2]*x)
  return(sum(y*log(theta)+(1-y)*log(1-theta)))
}
minusloglike.logit = function(beta){-loglikelihood.logit(beta)}
minusloglike.probit = function(beta){-loglikelihood.probit(beta)}
beta.mle.logit = round(nlm(minusloglike.logit,c(-5,0.1))$estimate,4)
beta.mle.probit = round(nlm(minusloglike.probit,c(-3,0.07))$estimate,4)
print(paste("betas MLE (logit) : (",beta.mle.logit[1],",",beta.mle.logit[2],")",sep=""))
print(paste("betas MLE (probit): (",beta.mle.probit[1],",",beta.mle.probit[2],")",sep=""))
```

```{r fig.width=9, fig.height=6}
N = 100
beta0s = seq(-10,-1.5,length=N)
beta1s = seq(0.01,0.2,length=N)
loglike.logit = matrix(0,N,N)
loglike.probit = matrix(0,N,N)
for (i in 1:N)
  for (j in 1:N){
    loglike.logit[i,j] = loglikelihood.logit(c(beta0s[i],beta1s[j]))
    loglike.probit[i,j] = loglikelihood.probit(c(beta0s[i],beta1s[j]))
  }
par(mfrow=c(1,2))
# Exponentiate after subtracting the maximum to avoid underflow far from the mode
contour(beta0s,beta1s,exp(loglike.logit-max(loglike.logit)),drawlabels=FALSE,
        xlab=expression(beta[0]),ylab=expression(beta[1]),
        main="Logit model")
points(beta.mle.logit[1],beta.mle.logit[2],col=2,pch=16)
contour(beta0s,beta1s,exp(loglike.probit-max(loglike.probit)),drawlabels=FALSE,
        xlab=expression(beta[0]),ylab=expression(beta[1]),
        main="Probit model")
points(beta.mle.probit[1],beta.mle.probit[2],col=2,pch=16)
```

## Fitted logit and probit regressions

```{r fig.width=9, fig.height=6}
xx = 0:100
eta.logit = beta.mle.logit[1]+beta.mle.logit[2]*xx
eta.probit = beta.mle.probit[1]+beta.mle.probit[2]*xx
prob.logit = exp(eta.logit)/(1+exp(eta.logit))
prob.probit = pnorm(eta.probit)
plot(x,y,xlab="Age",ylab="Coronary heart disease status")
lines(xx,prob.logit,col=2,lwd=2)
lines(xx,prob.probit,col=3,lwd=2)
```

## Bayesian inference via SIR

Below we assume a bivariate uniform prior for $(\beta_0,\beta_1)$ on the rectangle ${\cal A}=(-10,-1.5) \times (0.01,0.2)$ and use sampling importance resampling (SIR) to convert prior draws into resampled posterior draws.
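Since the prior is uniform on ${\cal A}$, the unnormalized importance weight of each prior draw is simply its likelihood value. As a sketch of a standard diagnostic, not part of the original analysis, the quality of such weights can be monitored through their effective sample size:

```{r}
# Illustrative diagnostic (not in the original analysis): effective sample
# size of a set of log-weights, ESS = (sum w)^2 / sum(w^2), computed on the
# exponentiated, max-stabilized weights
ess = function(logw){
  w = exp(logw - max(logw))
  sum(w)^2/sum(w^2)
}
```

After running the next chunk, `ess(w.logit)` and `ess(w.probit)` can be inspected; values close to `M0` indicate nearly uniform weights, while small values warn that the resample may be degenerate.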
```{r fig.width=9, fig.height=6}
set.seed(2325)
M0 = 20000
M = 10000
# Prior draws from the uniform on A
draws.beta0 = runif(M0,-10,-1.5)
draws.beta1 = runif(M0,0.01,0.2)
# Log importance weights: under a uniform prior these are the log-likelihoods
w.logit = rep(0,M0)
w.probit = rep(0,M0)
for (i in 1:M0){
  w.logit[i] = loglikelihood.logit(c(draws.beta0[i],draws.beta1[i]))
  w.probit[i] = loglikelihood.probit(c(draws.beta0[i],draws.beta1[i]))
}
# Under the probit link, theta can underflow to 0 or 1, producing NaN
# log-likelihoods; replace those with the smallest finite value
minw = min(w.probit[!is.na(w.probit)])
w.probit[is.na(w.probit)] = minw
# Monte Carlo estimates of the prior predictives (marginal likelihoods)
prob.m2.logit = mean(exp(w.logit))
prob.m2.probit = mean(exp(w.probit))
# Stabilize the weights and resample
w.logit = exp(w.logit-max(w.logit))
w.probit = exp(w.probit-max(w.probit))
ind.logit = sample(1:M0,size=M,replace=TRUE,prob=w.logit)
ind.probit = sample(1:M0,size=M,replace=TRUE,prob=w.probit)
beta0.logit = draws.beta0[ind.logit]
beta1.logit = draws.beta1[ind.logit]
beta0.probit = draws.beta0[ind.probit]
beta1.probit = draws.beta1[ind.probit]
par(mfrow=c(1,2))
plot(beta0.logit,beta1.logit,xlab=expression(beta[0]),ylab=expression(beta[1]))
points(beta.mle.logit[1],beta.mle.logit[2],col=2,pch=16)
plot(beta0.probit,beta1.probit,xlab=expression(beta[0]),ylab=expression(beta[1]))
points(beta.mle.probit[1],beta.mle.probit[2],col=2,pch=16)
par(mfrow=c(2,2))
hist(beta0.logit,prob=TRUE,xlab="",main=expression(beta[0]))
abline(v=beta.mle.logit[1],col=2,lwd=2)
hist(beta1.logit,prob=TRUE,xlab="",main=expression(beta[1]))
abline(v=beta.mle.logit[2],col=2,lwd=2)
hist(beta0.probit,prob=TRUE,xlab="",main=expression(beta[0]))
abline(v=beta.mle.probit[1],col=2,lwd=2)
hist(beta1.probit,prob=TRUE,xlab="",main=expression(beta[1]))
abline(v=beta.mle.probit[2],col=2,lwd=2)
```

# Model comparison

## Model comparison via posterior predictive summaries

```{r fig.width=9, fig.height=6}
xx = 0:100
N = length(xx)
prob.bayes.logit = matrix(0,M,N)
prob.bayes.probit = matrix(0,M,N)
for (i in 1:M){
  prob.bayes.logit[i,] = 1/(1+exp(-beta0.logit[i]-beta1.logit[i]*xx))
  prob.bayes.probit[i,] = pnorm(beta0.probit[i]+beta1.probit[i]*xx)
}
prob.bayes.logit = t(apply(prob.bayes.logit,2,quantile,c(0.05,0.5,0.95)))
prob.bayes.probit = t(apply(prob.bayes.probit,2,quantile,c(0.05,0.5,0.95)))
par(mfrow=c(1,1))
plot(x,y,xlim=range(xx),xlab="Age",ylab="Coronary heart disease status")
lines(xx,prob.bayes.logit[,2],col=2,lwd=2)
lines(xx,prob.bayes.logit[,1],col=2,lwd=2,lty=2)
lines(xx,prob.bayes.logit[,3],col=2,lwd=2,lty=2)
lines(xx,prob.bayes.probit[,2],col=3,lwd=2)
lines(xx,prob.bayes.probit[,1],col=3,lwd=2,lty=2)
lines(xx,prob.bayes.probit[,3],col=3,lwd=2,lty=2)
abline(h=qbeta.m1[1],col=6,lty=2)
abline(h=qbeta.m1[2],col=6)
abline(h=qbeta.m1[3],col=6,lty=2)
legend("bottomright",legend=c("iid","logit","probit"),col=c(6,2,3),lty=1,lwd=2)
```

## Computing Bayes factors and posterior model probabilities
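The three models are compared through their prior predictives (marginal likelihoods). For the logit and probit models these were estimated above by the Monte Carlo averages `prob.m2.logit` and `prob.m2.probit`; for the Bernoulli model, the prior predictive of the sequence $(y_1,\ldots,y_n)$ is available in closed form as the Beta-Binomial probability of $s_n$ divided by the binomial coefficient $\binom{n}{s_n}$, so that all three marginals refer to the same data object. Bayes factors and, under equal prior model probabilities, posterior model probabilities then follow as
$$
B_{jk} = \frac{p({\cal D}_n|M_j)}{p({\cal D}_n|M_k)}
\qquad \mbox{and} \qquad
Pr(M_j|{\cal D}_n) = \frac{p({\cal D}_n|M_j)}{\sum_{k} p({\cal D}_n|M_k)}.
$$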
```{r}
# Closed-form prior predictive of the Bernoulli model M1: the Beta-Binomial
# probability of sn divided by the binomial coefficient, so that it refers to
# the sequence (y1,...,yn), as do the logit and probit marginal likelihoods
term1 = gamma(a0+b0)/(gamma(a0)*gamma(b0))
term2 = gamma(a0+sn)*gamma(b0+n-sn)/gamma(a0+b0+n)
term3 = gamma(n+1)/(gamma(sn+1)*gamma(n-sn+1))
prob.m1 = term1*term2*term3/(gamma(n+1)/gamma(sn+1)/gamma(n-sn+1))
BF12 = prob.m1/prob.m2.logit
BF32 = prob.m2.probit/prob.m2.logit
PMP.m1 = prob.m1/(prob.m1+prob.m2.logit+prob.m2.probit)
PMP.m2.logit = prob.m2.logit/(prob.m1+prob.m2.logit+prob.m2.probit)
PMP.m2.probit = 1-PMP.m1-PMP.m2.logit
print(paste("Prior predictive M1 = ",prob.m1,sep=""))
print(paste("Prior predictive M2 logit = ",prob.m2.logit,sep=""))
print(paste("Prior predictive M2 probit = ",prob.m2.probit,sep=""))
print(paste("Bayes factor M1 vs M2 logit = ",round(BF12,3),sep=""))
print(paste("Bayes factor M2 probit vs M2 logit = ",round(BF32,3),sep=""))
print(paste("Posterior model probability for M1 = ",round(PMP.m1,3),sep=""))
print(paste("Posterior model probability for M2 logit = ",round(PMP.m2.logit,3),sep=""))
print(paste("Posterior model probability for M2 probit = ",round(PMP.m2.probit,3),sep=""))
```
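As a numerical sanity check, not part of the original analysis, the closed-form marginal likelihood of $M_1$ equals $B(a_0+s_n,b_0+n-s_n)/B(a_0,b_0)$ and can be computed on the log scale with base R's `lbeta`, which avoids the very large intermediate `gamma` values above.

```{r}
# Cross-check (not in the original analysis): log-scale computation of the
# M1 marginal likelihood; should match prob.m1 up to rounding
exp(lbeta(a0+sn, b0+n-sn) - lbeta(a0, b0))
```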